Setup

knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(warning = FALSE, message = FALSE)

# Load libraries
# library(ggchromatic)
library(scater)
library(ggpubr)
library(igraph)
library(dittoSeq)
library(Rphenograph)
library(aricode)
library(viridis)
# Set general input paths to all analysis files
analysis.path <- "/mnt/bb_dqbm_volume/analysis"

# Set path to masks
tonsil.path <- "/mnt/bb_dqbm_volume/data/Tonsil_th152"
lung.path <- "/mnt/bb_dqbm_volume/data/Immucan_lung"
# Import helper fncs
source("../analysis/helpers/helper_fnc.R")

Read Inputs

first, we look at U computed using different training datasets. For this, we read in the results from CISI training on the Tonsil th152 and the Immucan lung dataset (tonsil_vs_lung.py).

## Read results
# Read in all results for tonsil and lung comparison into one dataframe
files <- list.files(analysis.path, "simulation_results.txt",
                    full.names=TRUE, recursive=TRUE)
# TODO: change this line if more files come along and need to exclude
files <- files[grepl(
  "tonsil_vs_lung|Tonsil_th152/training/full/k_|Immucan_lung/training/subset", files)]
res <- lapply(files, read_results, type="res") %>% 
  bind_rows() 

# Get information about best U files
u_files <- res %>% 
  dplyr::filter(dataset==training) %>% 
  dplyr::select(k, training, `Best crossvalidation fold`, datasize) %>% 
  distinct()

# Harmonize all dataset names
patterns <- unique(unlist(lapply(res$training, function(name){
  if(length(str_split(name, "_")[[1]])==1){
    name
  }
})))

res$training <- unlist(lapply(res$training, function(pat){
  patterns[str_detect(pat, fixed(patterns, ignore_case=TRUE))]
}))
res$dataset <- unlist(lapply(res$dataset, function(pat){
  patterns[str_detect(pat, fixed(patterns, ignore_case=TRUE))]
}))

## Read U
u <- lapply(1:(nrow(u_files)), function(i){
  file <- file.path(analysis.path, u_files[i, "training"], "training",
                    u_files[i, "datasize"], 
                    paste0("k_", u_files[i, "k"]),
                    paste0("crossvalidation_", u_files[i, "Best crossvalidation fold"]),
                    "gene_modules.csv")
  u_temp <- read_U(file, "training")
}) %>% bind_rows() %>%
  dplyr::rename(dataset=rep)

## Read X_test and X_simulated
# Specify X_test and X_simulated files to be read in
X.files <- list.files(analysis.path, "X_", full.names=TRUE, recursive=TRUE)
# TODO: change this line if more files come along and need to exclude
X.files <- X.files[grepl(
  "tonsil_vs_lung|Tonsil_th152/training/full/k_|Immucan_lung/training/subset", X.files)]
X.files <- X.files[!grepl("_processed", X.files)]

# Read processed SCE to avoid computing UMAP, TSNE, ... all the time since it is computationally
# expensive
# X.files <- X.files[grepl("_processed", X.files)]

# Read in all sce experiments from saved anndata and save additional info in metadata
# (e.g. which dataset is used in training and testing, which k was used and if it
# is the ground truth or simulated X)
# Remove neg. values and transform counts?
X.list <- lapply(X.files, read_results, type="x")
X.list <- lapply(X.list, function(sce.temp){
  pat <- metadata(sce.temp)
  metadata(sce.temp)$dataset <- patterns[str_detect(pat$dataset, 
                                                    fixed(patterns, ignore_case=TRUE))]
  metadata(sce.temp)$training <- patterns[str_detect(pat$training, 
                                                     fixed(patterns, ignore_case=TRUE))]
  assay.name <- names(assays(sce.temp))
  for (a in assay.name[-1]) {
    assay(sce.temp, a) <- NULL
  }
  assay(sce.temp, "exprs") <- asinh(counts(sce.temp)/1)
  
  sce.temp
})

# Add reduced dimensions
set.seed(220225)
X.list <- lapply(X.list, function(sce){
  sce <- runUMAP(sce, exprs_values="exprs") 
  sce <- runTSNE(sce, exprs_values="exprs")
  sce
})

# Save SCE to avoid computing UMAP, TSNE, ... all the time since it is computationally
# expensive
# lapply(1:(length(X.list)), function(i){saveRDS(X.list[i], 
#                                                paste0(gsub("\\..*", "", X.files[i]),
#                                                       "_processed.rds"))})

## Read masks
# Read in masks for tonsil data
masks.tonsil <- loadImages(file.path(tonsil.path, "steinbock/masks_deepcell"), as.is = TRUE)
mcols(masks.tonsil) <- DataFrame(sample_id=names(masks.tonsil))

# Read in masks for lung data
masks.lung <- loadImages(file.path(lung.path, "masks"), as.is = TRUE)
mcols(masks.lung) <- DataFrame(sample_id=names(masks.lung))

CISI results

Results

# For each different measurement of training results, plot a barplot comparing
# diff. k-sparsities, simulation types and training-test set combinations
for (measure in names(res)[2:10]) {
  cat("##### ", measure, "\n")
  
  # Reorder dataframe for plotting
  data_temp <- res %>%
    dplyr::select(dataset, training, simulation, k, !!measure) %>%
    mutate(group=paste(training, dataset, sep="_"))
  
  # Create barplot
  results.barplot <- plot_cisi_results(data_temp, "group", measure, "k")
  
  print(results.barplot)
  cat("\n\n")
}
Overall pearson

Overall spearman

Gene average

Sample average

Sample dist pearson

Sample dist spearman

Gene dist pearson

Gene dist spearman

Matrix coherence (90th ptile)

U

Heatmaps of U

cor <- plot_U(u, "k", "dataset")
k = 1

k = 3

Module Comparisons

plot_U_membership(u, "k", "dataset")
k = 1

k = 3

Mantel Test between U’s

# Calculate mantel test between U's of tonsil and lung for all k's
mantel <- lapply(cor, function(l){
  mantel_test(l[[1]], l[[2]])$mantel_r
  }) %>%
  as.data.frame(col.names=names(cor), check.names=FALSE)

knitr::kable(mantel, digits=2,
             col.names=paste0("k = ", names(mantel)))
k = 1 k = 3
0.67 0.42

Image results

CD20 of Test Image (Tonsil th152, k=1)

# Subset to training and test of tonsil data
X.plotList <- keep(X.list, function(x){
  metadata(x)$training=="tonsil" & metadata(x)$dataset=="tonsil" & metadata(x)$k=="1"})
names(X.plotList) <- lapply(X.plotList, 
                            function(x){metadata(x)$ground_truth})
# Call plot_cells to get individual plots for each roi for decomposed and true
# results
X.imgList <- plot_cells(X.plotList, masks.tonsil, 
                        "CD20")
# Plot decomposed vs true results for test roi (002)
X.img <- plot_grid(plotlist=append(X.imgList[grepl("20220520_TsH_th152_cisi1_002",
                                                   names(X.imgList))],
                                   X.imgList[grepl("legend",
                                                   names(X.imgList))]),
                   ncol=2, labels=names(X.plotList),
                   label_size=15, hjust=c(-2, -1.5), 
                   vjust=1, scale=0.9)
X.img

CD20 of Test Image Overlayed (Tonsil th152, k=1)

# Rename list of tonsil data for nicer titles in plotting
X.plotListRenamed <- lapply(names(X.plotList), function(n){
  rownames(X.plotList[[n]]) <- paste(rownames(X.plotList[[n]]),
                                     n, sep="\n")
  reducedDims(X.plotList[[n]]) <- NULL
  X.plotList[[n]]
})
names(X.plotListRenamed) <- names(X.plotList)
# Add decomposed and true dataset of tonsil into one SCE
X.overlayed <- do.call("rbind", X.plotListRenamed)

# Define protein of interest
pois <- c("CD20\nsimulated", "CD20\nground_truth")

# Plot cells coloured according to decomposed and true poi values 
# (if similar in both should have a mixture of colours, e.g. orange) 
X.imgDiff <- plotCells(mask=masks.tonsil[unique(colData(X.overlayed)$sample_id)], 
                       object=X.overlayed,
                       cell_id="ObjectNumber", img_id="sample_id", colour_by=pois,
                       return_plot=TRUE,  image_title=list(cex=1),
                       scale_bar=list(cex=1, lwidth=5),
                       legend=list(colour_by.title.cex=0.7, colour_by.labels.cex=0.7))

ggdraw(X.imgDiff$plot)

Counts results

Scatterplots of arcsinh Transformed Counts (Immucan Lung)

Plot of arcsinh transformed counts coloured by defined celltype.

# Subset to dataset trained and tested on the Immucan lung dataset
X.exprsList <- keep(X.list, function(x){
  metadata(x)$training=="lung" & metadata(x)$dataset=="lung"})
names(X.exprsList) <- lapply(X.exprsList,
                             function(x){metadata(x)$ground_truth})
k_s <- unique(unlist(lapply(X.exprsList, function(sce_temp){metadata(sce_temp)$k})))

for (k in k_s) {
  cat("##### k =", k, "\n")
  
  # Subset for k of interest
  X.exprsListK <- keep(X.exprsList, function(x){metadata(x)$k==k})
  
  # Plot asinh transformed counts of proteins of interest of decomposed and true 
  # datasets coloured by different celltypes
  X.Exprs <- plot_grid(plot_exprs(X.exprsListK, "B", "CD3", "CD20"),
                       plot_exprs(X.exprsListK, "BnT", "CD3", "CD20"),
                       plot_exprs(X.exprsListK, "Neutro", "MPO", "CD15"),
                       ncol=1)
  print(X.Exprs)
  cat("\n\n")
}
k = 1

k = 3

## Per protein results

Results per Protein (k=1, arcsinh)

Scatterplot of ground truth vs decomposed results per protein for k=1 and asinh transformed counts.

# Add all X_test counts together into dataframe for easier plotting
aoi <- "exprs"
X.df <- lapply(X.list, function(sce){
  counts.long <- as.data.frame(assays(sce)[[aoi]]) %>%
    mutate(protein=rownames(.)) %>%
    melt() %>%
    dplyr::rename(!!as.symbol(metadata(sce)$ground_truth):=value,
                  cell=variable) %>%
    mutate(k=metadata(sce)$k,
           dataset=paste(metadata(sce)$training, metadata(sce)$dataset, sep="_"))
  
}) %>% bind_rows() %>%
  group_by(protein, cell, k, dataset) %>%
  summarise_all(na.omit)

# For each test/training dataset combination plot for each true vs decomposed
# results in scatter plot and add diagonal and regression line, as well as
# R (pearson correlation coefficient)
for (i in unique(X.df$dataset)) {
  cat("#####", i, "\n")
  
  proteinPlot <- ggscatter(X.df %>%
                             dplyr::filter(k=="1" & dataset==i), 
                           x="ground_truth", y="simulated",
                           add="reg.line",
                           color=pal_npg("nrc")("1"),
                           add.params=list(color=pal_npg("nrc")("4")[4],
                                           size=2)) +
    facet_wrap(~ protein, scales="free", ncol=5) +
    theme_cowplot(10) +
    geom_abline(slope=1, linetype="dashed") +
    stat_cor(size=2)
  
  print(proteinPlot)
  cat("\n\n")
}
lung_lung

tonsil_lung

lung_tonsil

tonsil_tonsil

Correlations per Protein (k=1, arcsinh)

Correlation between ground truth and decomposed results per protein for k=1 and asinh transformed counts.

# Calculate correlations between ground truth and simulated data for each protein
X.cor <- lapply(X.list, function(sce){
  counts.long <- as.data.frame(assays(sce)[["counts"]]) %>%
    mutate(protein=rownames(.)) %>%
    melt() %>%
    dplyr::rename(!!as.symbol(metadata(sce)$ground_truth):=value,
                  cell=variable) %>%
    mutate(k=metadata(sce)$k,
           dataset=paste(metadata(sce)$training, metadata(sce)$dataset, sep="_"))
  
}) %>% bind_rows() %>%
  group_by(protein, cell, k, dataset) %>%
  summarise_all(na.omit) %>%
  group_by(k, dataset, protein) %>%
  summarize(correlation=cor(ground_truth, simulated),
            median=median(ground_truth),
            coef_of_var=sd(ground_truth)/mean(ground_truth),
            relative_var=var(ground_truth)/mean(ground_truth)) 

# Plot correlations for k=1 for each test/training dataset combination
for (i in unique(X.df$dataset)) {
  cat("#####", i, "\n")
  
  proteinCorr <- ggplot(X.cor %>% 
                          dplyr::filter(k=="1" & dataset==i),
                        aes(x=protein, y=correlation, fill=protein)) +
    geom_bar(stat="identity") +
    theme_cowplot() +
    scale_fill_igv() +
    guides(fill=FALSE) +
    coord_flip()
  
  print(proteinCorr)
  cat("\n\n")
}
lung_lung

tonsil_lung

lung_tonsil

tonsil_tonsil

Correlations per Protein vs Protein Characteristics (k=1, arcsinh)

Correlation between ground truth and decomposed results per protein for k=1 and asinh transformed counts compared to protein characteristics of ground truth: Median, max and variance.

# Define characteristics of interest
coi <- c("median", "coef_of_var", "relative_var")

# Plot correlations for k=1 for each test/training dataset combination
for (i in coi) {
  cat("#####", i, "\n")
  
  # proteinChar <- ggscatter(X.cor %>%
  #                            dplyr::filter(k=="1"), 
  #                          x=i, y="correlation",
  #                          add="reg.line",
  #                          color=pal_npg("nrc")("1"),
  #                          add.params=list(color=pal_npg("nrc")("4")[4],
  #                                          size=2)) +
  proteinChar <- ggplot(X.cor %>% dplyr::filter(k=="1"), 
                        aes(x=!!sym(i), y=correlation)) +
    geom_point(color=pal_npg("nrc")("1")) +
    facet_wrap(~ dataset, scales="free", ncol=2) +
    theme_cowplot(10) +
    stat_cor(size=2) +
    stat_smooth(method="lm",
                formula=y ~ log(x),
                se=FALSE,
                color=pal_npg("nrc")("4")[4], size=2)
  
  print(proteinChar)
  cat("\n\n")
}
median

coef_of_var

relative_var

Reduced dimensions and clustering results

Reduced Dimension Plots (k=1, Immucan_lung, arcsinh)

Below the UMAP and TSNE of the arcsinh counts for the Immucan lung dataset are shown using k=1. Red indicates a high expression of the particular protein.

# Find highly correlated proteins that become more correlated afterwards
# X.corProtein <- lapply(X.list, function(sce){
#   counts.long <- as.data.frame(cor(t(assays(sce)[[aoi]]))) %>%
#     mutate(proteinA=rownames(.)) %>%
#     melt() %>%
#     dplyr::rename(!!as.symbol(metadata(sce)$ground_truth):=value,
#                   proteinB=variable) %>%
#     mutate(k=metadata(sce)$k,
#            dataset=paste(metadata(sce)$training, metadata(sce)$dataset, sep="_")) %>%
#     filter(!(proteinA==proteinB)) %>%
#     mutate(combProtein=str_c(pmin(as.character(proteinA), 
#                                   as.character(proteinB)),
#                                   pmax(as.character(proteinA), 
#                                        as.character(proteinB))),
#            proteinB=as.character(proteinB)) %>%
#     filter(!duplicated(combProtein)) %>%
#     dplyr::select(-combProtein) %>%
#     filter(grepl("lung", dataset) & k==1)
#   
# }) %>% bind_rows() %>%
#   group_by(proteinA, proteinB, k, dataset) %>%
#   summarise_all(na.omit) %>%
#   mutate(diff=simulated-ground_truth) %>%
#   group_by(k, dataset) %>%
#   dplyr::filter(grepl("lung", dataset) & !grepl("tonsil", dataset)) %>%
#   dplyr::filter(ground_truth>0.6 & diff>0) %>%
#   arrange(desc(diff))

aoi <- "exprs"
X.exprsList1 <- keep(X.exprsList, function(x){metadata(x)$k=="1"})

for(protein in rownames(X.exprsList1[[1]])){
  cat("#####", protein, "\n")
  
  # Create empty list for plots
  X.redDimlist <- list()
  # Extract for simulated and true results for UMAP and TSNE and plot and
  # colour highly correlated ground truth cells
  for(sce in X.exprsList1){
    for(method in c("UMAP", "TSNE")){
      # sce.temp <- reducedDims(sce)[[method]] %>%
      #   as.data.frame() %>%
      #   mutate(!!X.corProtein$proteinA[1] :=assays(sce)[[aoi]][X.corProtein$proteinA[1],],
      #          !!X.corProtein$proteinB[1] :=assays(sce)[[aoi]][X.corProtein$proteinB[1],])
      # 
      # X.redDimlist <- append(X.redDimlist, 
      #                        list(ggplot(sce.temp, aes(V1, V2, colour=cmy_spec(!!sym(X.corProtein$proteinA[1]), 
      #                                                                          !!sym(X.corProtein$proteinB[1])))) +
      #                               geom_point(size=0.3) + 
      #                               theme_cowplot(7)))
      
      X.redDimlist <- append(X.redDimlist,
      list(dittoDimPlot(sce,
                        var=protein,
                        reduction.use=method,
                        size=0.2,
                        min.color="grey",
                        max.color=pal_npg(("nrc"))("1")) +
             #scale_color_igv() +
             theme_cowplot() +
             theme(legend.position="bottom") +
             guides(color=guide_legend(nrow=1))))
    }
  }
  # Remove empty plot at the beginning and add legend to the end of the plot list
  X.redDimlist <- c(lapply(X.redDimlist, function(p){
    p + theme(legend.position="none")
  }), list(get_legend(X.redDimlist[2])))
  
  # Plot simulated and true reduced dim on top of each other
  X.redDimPlot <- plot_grid(plotlist=X.redDimlist,
                            ncol=2, labels=rep(names(X.exprsList1), each=2),
                            label_size=15, hjust=c(-2, -1.5), vjust=1, scale=0.9)
  
  print(X.redDimPlot)
  cat("\n\n")
}
MPO

SMA

CD38

HLA-DR

CD27

CD303

CD68

CD45RA

CD20

GranzymeB

CD3

CD11c

PD-1

CD14

PD-L1

TCF1/TCF7

CD45RO

FOXP3

ICOS

Ki-67

CD8a

VISTA

CD4

CD15

Cluster overlap between ground truth and simulated data (arcsinh)

To investigate the clustering results of ground truth and simulated data a bit more, we show the overlap of clusters computed using Rphenograph (run with k=47 clusters) individually and compare which clusters overlap using a heatmap.

# Compute clusters for each dataset using Rphenograph
all.clusters <- lapply(X.list, function(sce){
  mat <- t(assay(sce, "exprs"))
  out <- Rphenograph(mat, k=47)
  clusters <- factor(membership(out[[2]])) %>%
    as.data.frame() %>%
    dplyr::rename(!!as.symbol(metadata(sce)$ground_truth):=".") %>%
    mutate(dataset=paste(metadata(sce)$training, metadata(sce)$dataset, sep="_"),
           k=metadata(sce)$k) 
}) %>% bind_rows() %>%
  group_by(k, dataset) %>%
  summarise_all(na.omit)

# Plot heatmaps showing overlap of clusters for each dataset and k
for (i in unique(all.clusters$k)){
  cat("##### k =", i, "\n")
  
  plot.new()
  ht_list <- NULL
  for (d in unique(all.clusters$dataset)){
    temp.matrix <- table(paste("sim", all.clusters %>% 
                                 dplyr::filter(k==i & dataset==d) %>% 
                                 pull(simulated)), 
                         paste("gt", all.clusters %>% 
                                 dplyr::filter(k==i & dataset==d) %>% 
                                 pull(ground_truth))) %>% 
      as.matrix()
    #temp.matrix <- log10(temp.matrix + 10)
    
    ht_list <- append(ht_list, 
                      list(grid.grabExpr(draw(Heatmap(temp.matrix,
                                                      col=magma(100),
                                                      column_title=d,
                                                      row_names_gp=gpar(fontsize = 8),
                                                      column_names_gp=gpar(fontsize=8),
                                                      column_title_gp=gpar(fontsize=10),
                                                      rect_gp=gpar(col="white", lwd=1))))))
  }
  
  print(plot_grid(plotlist=ht_list, nrow=1))
  cat("\n\n")
}
k = 1

k = 3

Adjusted random index (ARI) between clusters (arcsinh)

Finally, we compute the adjusted random index (ARI) between the clustering of the ground truth and simulated data to assess the overlap.

ari.plot <- ggplot(all.clusters %>%
                    group_by(dataset, k) %>%
                    summarize(ARI=ARI(simulated, ground_truth)),
                  aes(x=dataset, y=ARI, fill=k)) +
  geom_bar(position="dodge", stat="identity") +
  theme_cowplot() +
  scale_fill_npg() +
  coord_flip() +
  ylim(0, 1)

print(ari.plot)